import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import random undersampling and other necessary libraries
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import Image
# import SVM libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
df=pd.read_csv('dmc2010_train.txt',delimiter='\;')
<ipython-input-2-d7e762b91827>:1: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
df=pd.read_csv('dmc2010_train.txt',delimiter='\;')
df
| customernumber | date | salutation | title | domain | datecreated | newsletter | model | paymenttype | deliverytype | ... | w2 | w3 | w4 | w5 | w6 | w7 | w8 | w9 | w10 | target90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 41191 | 2008-12-01 | 0 | 0 | 9 | 2008-12-01 | 0 | 2 | 2 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 38860 | 2008-12-16 | 1 | 0 | 4 | 2008-12-16 | 0 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 61917 | 2008-08-19 | 0 | 0 | 12 | 2008-08-19 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 40647 | 2008-06-16 | 1 | 0 | 8 | 2008-06-16 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 1347 | 2008-08-08 | 0 | 0 | 1 | 2008-08-08 | 0 | 1 | 1 | 1 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32423 | 7784 | 2008-10-21 | 1 | 0 | 8 | 2008-10-21 | 0 | 1 | 2 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 32424 | 41695 | 2008-11-09 | 1 | 0 | 4 | 2008-11-09 | 0 | 1 | 3 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 32425 | 7612 | 2008-04-12 | 2 | 0 | 9 | 2008-04-12 | 0 | 3 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 32426 | 31941 | 2008-11-15 | 0 | 0 | 12 | 2008-11-15 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 32427 | 58849 | 2008-07-28 | 1 | 0 | 5 | 2008-07-28 | 0 | 2 | 2 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
32428 rows × 38 columns
df['date'] = pd.to_datetime(df['date'],
format = '%Y-%m-%dT%H:%M:%SZ',
errors = 'coerce')
df['datecreated'] = pd.to_datetime(df['datecreated'],
format = '%Y-%m-%dT%H:%M:%SZ',
errors = 'coerce')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32428 entries, 0 to 32427 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customernumber 32428 non-null int64 1 date 0 non-null datetime64[ns] 2 salutation 32428 non-null int64 3 title 32428 non-null int64 4 domain 32428 non-null int64 5 datecreated 0 non-null datetime64[ns] 6 newsletter 32428 non-null int64 7 model 32428 non-null int64 8 paymenttype 32428 non-null int64 9 deliverytype 32428 non-null int64 10 invoicepostcode 32428 non-null int64 11 delivpostcode 1392 non-null object 12 voucher 32428 non-null int64 13 advertisingdatacode 6523 non-null object 14 case 32428 non-null int64 15 numberitems 32428 non-null int64 16 gift 32428 non-null int64 17 entry 32428 non-null int64 18 points 32428 non-null int64 19 shippingcosts 32428 non-null int64 20 deliverydatepromised 32428 non-null object 21 deliverydatereal 32428 non-null object 22 weight 32428 non-null int64 23 remi 32428 non-null int64 24 cancel 32428 non-null int64 25 used 32428 non-null int64 26 w0 32428 non-null int64 27 w1 32428 non-null int64 28 w2 32428 non-null int64 29 w3 32428 non-null int64 30 w4 32428 non-null int64 31 w5 32428 non-null int64 32 w6 32428 non-null int64 33 w7 32428 non-null int64 34 w8 32428 non-null int64 35 w9 32428 non-null int64 36 w10 32428 non-null int64 37 target90 32428 non-null int64 dtypes: datetime64[ns](2), int64(32), object(4) memory usage: 9.4+ MB
df.describe()
| customernumber | salutation | title | domain | newsletter | model | paymenttype | deliverytype | invoicepostcode | voucher | ... | w2 | w3 | w4 | w5 | w6 | w7 | w8 | w9 | w10 | target90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | ... | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 | 32428.000000 |
| mean | 33389.298569 | 0.541569 | 0.006969 | 7.517115 | 0.169483 | 1.646910 | 1.000987 | 0.201955 | 48.752282 | 0.162020 | ... | 0.276644 | 0.018903 | 0.047027 | 0.180986 | 0.027908 | 0.023128 | 0.000185 | 0.164981 | 0.092883 | 0.186598 |
| std | 19148.090449 | 0.657044 | 0.083192 | 3.683945 | 0.375184 | 0.825981 | 1.092677 | 0.401465 | 24.361425 | 0.368475 | ... | 1.353981 | 0.253596 | 0.434265 | 0.561751 | 0.299862 | 0.401782 | 0.013601 | 0.836705 | 0.610509 | 0.389594 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 16802.750000 | 0.000000 | 0.000000 | 4.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 30.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 33552.500000 | 0.000000 | 0.000000 | 9.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 47.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 75% | 50034.250000 | 1.000000 | 0.000000 | 11.000000 | 0.000000 | 2.000000 | 2.000000 | 0.000000 | 66.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| max | 66251.000000 | 2.000000 | 1.000000 | 12.000000 | 1.000000 | 3.000000 | 3.000000 | 1.000000 | 99.000000 | 1.000000 | ... | 90.000000 | 15.000000 | 36.000000 | 14.000000 | 27.000000 | 55.000000 | 1.000000 | 48.000000 | 50.000000 | 1.000000 |
8 rows × 32 columns
df.columns
Index(['customernumber', 'date', 'salutation', 'title', 'domain',
'datecreated', 'newsletter', 'model', 'paymenttype', 'deliverytype',
'invoicepostcode', 'delivpostcode', 'voucher', 'advertisingdatacode',
'case', 'numberitems', 'gift', 'entry', 'points', 'shippingcosts',
'deliverydatepromised', 'deliverydatereal', 'weight', 'remi', 'cancel',
'used', 'w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6', 'w7', 'w8', 'w9',
'w10', 'target90'],
dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32428 entries, 0 to 32427 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 customernumber 32428 non-null int64 1 date 0 non-null datetime64[ns] 2 salutation 32428 non-null int64 3 title 32428 non-null int64 4 domain 32428 non-null int64 5 datecreated 0 non-null datetime64[ns] 6 newsletter 32428 non-null int64 7 model 32428 non-null int64 8 paymenttype 32428 non-null int64 9 deliverytype 32428 non-null int64 10 invoicepostcode 32428 non-null int64 11 delivpostcode 1392 non-null object 12 voucher 32428 non-null int64 13 advertisingdatacode 6523 non-null object 14 case 32428 non-null int64 15 numberitems 32428 non-null int64 16 gift 32428 non-null int64 17 entry 32428 non-null int64 18 points 32428 non-null int64 19 shippingcosts 32428 non-null int64 20 deliverydatepromised 32428 non-null object 21 deliverydatereal 32428 non-null object 22 weight 32428 non-null int64 23 remi 32428 non-null int64 24 cancel 32428 non-null int64 25 used 32428 non-null int64 26 w0 32428 non-null int64 27 w1 32428 non-null int64 28 w2 32428 non-null int64 29 w3 32428 non-null int64 30 w4 32428 non-null int64 31 w5 32428 non-null int64 32 w6 32428 non-null int64 33 w7 32428 non-null int64 34 w8 32428 non-null int64 35 w9 32428 non-null int64 36 w10 32428 non-null int64 37 target90 32428 non-null int64 dtypes: datetime64[ns](2), int64(32), object(4) memory usage: 9.4+ MB
Image(filename="c-dtyps.png")
df.mean()
customernumber 33389.298569 salutation 0.541569 title 0.006969 domain 7.517115 newsletter 0.169483 model 1.646910 paymenttype 1.000987 deliverytype 0.201955 invoicepostcode 48.752282 voucher 0.162020 case 2.934378 numberitems 2.019551 gift 0.004564 entry 0.414642 points 0.000000 shippingcosts 0.150611 weight 637.920809 remi 0.059979 cancel 0.061613 used 0.068860 w0 0.902122 w1 0.404342 w2 0.276644 w3 0.018903 w4 0.047027 w5 0.180986 w6 0.027908 w7 0.023128 w8 0.000185 w9 0.164981 w10 0.092883 target90 0.186598 dtype: float64
df.std()
customernumber 19148.090449 date NaT salutation 0.657044 title 0.083192 domain 3.683945 datecreated NaT newsletter 0.375184 model 0.825981 paymenttype 1.092677 deliverytype 0.401465 invoicepostcode 24.361425 voucher 0.368475 case 1.31927 numberitems 1.726046 gift 0.067404 entry 0.492668 points 0.0 shippingcosts 0.357674 weight 724.358131 remi 0.38874 cancel 0.306833 used 0.474444 w0 1.654767 w1 1.410395 w2 1.353981 w3 0.253596 w4 0.434265 w5 0.561751 w6 0.299862 w7 0.401782 w8 0.013601 w9 0.836705 w10 0.610509 target90 0.389594 dtype: object
Colums types:
df.dtypes
customernumber int64 date datetime64[ns] salutation int64 title int64 domain int64 datecreated datetime64[ns] newsletter int64 model int64 paymenttype int64 deliverytype int64 invoicepostcode int64 delivpostcode object voucher int64 advertisingdatacode object case int64 numberitems int64 gift int64 entry int64 points int64 shippingcosts int64 deliverydatepromised object deliverydatereal object weight int64 remi int64 cancel int64 used int64 w0 int64 w1 int64 w2 int64 w3 int64 w4 int64 w5 int64 w6 int64 w7 int64 w8 int64 w9 int64 w10 int64 target90 int64 dtype: object
df.max()
customernumber 66251 date NaT salutation 2 title 1 domain 12 datecreated NaT newsletter 1 model 3 paymenttype 3 deliverytype 1 invoicepostcode 99 voucher 1 case 5 numberitems 50 gift 1 entry 1 points 0 shippingcosts 1 deliverydatepromised 4746-11-26 deliverydatereal 2009-12-30 weight 20076 remi 19 cancel 17 used 19 w0 99 w1 84 w2 90 w3 15 w4 36 w5 14 w6 27 w7 55 w8 1 w9 48 w10 50 target90 1 dtype: object
df.min()
customernumber 1 date NaT salutation 0 title 0 domain 0 datecreated NaT newsletter 0 model 1 paymenttype 0 deliverytype 0 invoicepostcode 0 voucher 0 case 1 numberitems 1 gift 0 entry 0 points 0 shippingcosts 0 deliverydatepromised 2008-04-02 deliverydatereal 0000-00-00 weight 0 remi 0 cancel 0 used 0 w0 0 w1 0 w2 0 w3 0 w4 0 w5 0 w6 0 w7 0 w8 0 w9 0 w10 0 target90 0 dtype: object
df.isna().sum()
customernumber 0 date 32428 salutation 0 title 0 domain 0 datecreated 32428 newsletter 0 model 0 paymenttype 0 deliverytype 0 invoicepostcode 0 delivpostcode 31036 voucher 0 advertisingdatacode 25905 case 0 numberitems 0 gift 0 entry 0 points 0 shippingcosts 0 deliverydatepromised 0 deliverydatereal 0 weight 0 remi 0 cancel 0 used 0 w0 0 w1 0 w2 0 w3 0 w4 0 w5 0 w6 0 w7 0 w8 0 w9 0 w10 0 target90 0 dtype: int64
df.isna().sum().sum()
121797
df.corr()
| customernumber | salutation | title | domain | newsletter | model | paymenttype | deliverytype | invoicepostcode | voucher | ... | w2 | w3 | w4 | w5 | w6 | w7 | w8 | w9 | w10 | target90 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| customernumber | 1.000000 | 0.001253 | 0.001822 | -0.004371 | -0.000181 | -0.000049 | 0.004337 | 0.002176 | 0.007331 | -0.004133 | ... | 0.001725 | -0.005051 | 0.008476 | 0.002725 | -0.003050 | 0.002238 | -0.001200 | 0.005347 | 0.009121 | 0.001242 |
| salutation | 0.001253 | 1.000000 | 0.033064 | 0.115485 | -0.059480 | -0.069440 | 0.106040 | -0.053046 | 0.011715 | -0.036099 | ... | -0.031698 | 0.018696 | 0.009198 | 0.023606 | 0.011877 | 0.005003 | 0.006041 | -0.004678 | -0.014162 | -0.028074 |
| title | 0.001822 | 0.033064 | 1.000000 | 0.011686 | -0.004252 | -0.010413 | 0.032492 | -0.007056 | 0.004702 | 0.007428 | ... | -0.010820 | 0.003987 | 0.016536 | -0.007854 | 0.005801 | -0.001132 | -0.001140 | 0.010506 | -0.008495 | -0.001114 |
| domain | -0.004371 | 0.115485 | 0.011686 | 1.000000 | -0.026329 | -0.002483 | 0.014988 | -0.003161 | -0.001469 | -0.028032 | ... | -0.010177 | 0.011289 | 0.009472 | -0.008717 | 0.007259 | 0.004525 | -0.003140 | 0.003996 | 0.015541 | 0.008615 |
| newsletter | -0.000181 | -0.059480 | -0.004252 | -0.026329 | 1.000000 | 0.056183 | 0.001172 | 0.020485 | 0.000238 | 0.002350 | ... | -0.000269 | 0.008138 | -0.003494 | 0.013799 | -0.001201 | -0.003705 | 0.024071 | 0.011323 | 0.002762 | 0.083011 |
| model | -0.000049 | -0.069440 | -0.010413 | -0.002483 | 0.056183 | 1.000000 | -0.024386 | 0.357522 | -0.017881 | -0.042948 | ... | 0.034043 | -0.045133 | -0.000218 | -0.192192 | 0.002683 | -0.002247 | -0.007910 | -0.027977 | -0.009326 | 0.048831 |
| paymenttype | 0.004337 | 0.106040 | 0.032492 | 0.014988 | 0.001172 | -0.024386 | 1.000000 | -0.000454 | 0.017520 | -0.063817 | ... | -0.028199 | 0.064036 | 0.095178 | 0.143398 | 0.082177 | 0.042727 | 0.020738 | 0.009638 | 0.027184 | -0.006011 |
| deliverytype | 0.002176 | -0.053046 | -0.007056 | -0.003161 | 0.020485 | 0.357522 | -0.000454 | 1.000000 | -0.025051 | -0.221198 | ... | 0.007220 | -0.037499 | 0.056076 | -0.162077 | 0.049499 | 0.037001 | -0.001196 | 0.009322 | -0.015386 | 0.061510 |
| invoicepostcode | 0.007331 | 0.011715 | 0.004702 | -0.001469 | 0.000238 | -0.017881 | 0.017520 | -0.025051 | 1.000000 | 0.011827 | ... | -0.033952 | 0.014360 | 0.000579 | 0.016125 | -0.002245 | -0.007027 | 0.004233 | 0.016541 | 0.007098 | 0.009634 |
| voucher | -0.004133 | -0.036099 | 0.007428 | -0.028032 | 0.002350 | -0.042948 | -0.063817 | -0.221198 | 0.011827 | 1.000000 | ... | -0.065736 | -0.025517 | -0.012543 | 0.126056 | -0.001013 | -0.002399 | 0.000172 | -0.034390 | -0.030023 | -0.029298 |
| case | 0.008751 | 0.054662 | 0.025803 | 0.037712 | 0.031380 | 0.106936 | -0.002971 | 0.050234 | 0.005619 | -0.382483 | ... | 0.187622 | -0.044408 | 0.069065 | -0.220079 | 0.052259 | 0.028637 | 0.021300 | 0.090156 | 0.084834 | 0.030245 |
| numberitems | 0.001103 | -0.082457 | 0.003132 | -0.013007 | 0.070124 | 0.074731 | -0.017015 | -0.005476 | -0.003444 | -0.013321 | ... | 0.254221 | 0.058759 | 0.111132 | -0.033483 | 0.115549 | 0.039547 | -0.008036 | 0.176943 | 0.049139 | 0.060062 |
| gift | 0.001854 | 0.007554 | -0.000173 | 0.001797 | -0.004980 | -0.002627 | 0.017525 | -0.034063 | -0.012364 | -0.008666 | ... | -0.010118 | -0.005047 | 0.004256 | -0.021816 | -0.004776 | -0.002759 | -0.000921 | -0.001869 | -0.005056 | -0.004247 |
| entry | -0.004161 | -0.089164 | -0.012572 | -0.015039 | 0.062586 | 0.901104 | -0.032496 | 0.318617 | -0.030854 | 0.089605 | ... | 0.021277 | -0.049409 | -0.000912 | -0.217122 | -0.004227 | 0.002963 | -0.006847 | -0.022468 | -0.013011 | 0.041292 |
| points | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| shippingcosts | -0.000619 | -0.001578 | -0.006258 | -0.006895 | -0.042458 | -0.121870 | -0.041964 | -0.211830 | -0.000931 | -0.093200 | ... | -0.047703 | -0.031389 | -0.030711 | -0.135669 | 0.051956 | 0.009880 | -0.005728 | -0.020379 | -0.012094 | -0.070894 |
| weight | -0.000008 | -0.073064 | 0.005074 | -0.007890 | 0.055792 | 0.128865 | -0.097110 | 0.022725 | -0.008013 | -0.020410 | ... | 0.240639 | -0.065648 | 0.019155 | -0.283716 | 0.019317 | -0.007185 | -0.007082 | 0.107474 | 0.020886 | 0.043502 |
| remi | 0.010021 | -0.009339 | 0.009960 | -0.006736 | 0.003458 | -0.006554 | -0.020976 | 0.022961 | 0.004620 | -0.031891 | ... | 0.027767 | 0.001011 | 0.001376 | -0.023162 | 0.008656 | 0.001978 | -0.002099 | 0.029402 | 0.017976 | 0.065579 |
| cancel | -0.003556 | 0.008864 | -0.003533 | 0.004169 | -0.005258 | -0.057497 | 0.005062 | 0.002377 | 0.004860 | -0.052566 | ... | -0.028930 | -0.014969 | -0.009017 | -0.064339 | 0.018180 | -0.002804 | -0.002732 | 0.079804 | -0.013266 | -0.014917 |
| used | 0.005534 | -0.022389 | -0.008253 | -0.011834 | 0.002346 | -0.113677 | -0.045222 | -0.073014 | 0.001377 | -0.044240 | ... | 0.051283 | -0.010819 | -0.011078 | -0.046762 | -0.009823 | -0.007223 | -0.001974 | -0.004770 | -0.016439 | 0.029418 |
| w0 | -0.008916 | 0.007940 | 0.000251 | 0.007155 | 0.019816 | 0.086105 | -0.057287 | 0.020425 | 0.009140 | -0.026944 | ... | 0.003538 | -0.040639 | -0.019600 | -0.175614 | -0.026626 | -0.016679 | -0.007416 | -0.070057 | -0.038499 | 0.016755 |
| w1 | 0.005185 | -0.036641 | 0.006470 | -0.003498 | 0.032213 | 0.061857 | -0.034977 | 0.010292 | 0.008815 | 0.040801 | ... | -0.034726 | -0.021371 | -0.015187 | -0.092368 | -0.011589 | -0.006871 | -0.003900 | -0.032253 | -0.023562 | 0.033917 |
| w2 | 0.001725 | -0.031698 | -0.010820 | -0.010177 | -0.000269 | 0.034043 | -0.028199 | 0.007220 | -0.033952 | -0.065736 | ... | 1.000000 | -0.015231 | -0.018875 | -0.065830 | -0.017953 | -0.010741 | -0.002780 | -0.035607 | -0.025303 | 0.016079 |
| w3 | -0.005051 | 0.018696 | 0.003987 | 0.011289 | 0.008138 | -0.045133 | 0.064036 | -0.037499 | 0.014360 | -0.025517 | ... | -0.015231 | 1.000000 | -0.008072 | -0.022501 | -0.006938 | -0.004291 | -0.001014 | -0.014698 | -0.011341 | 0.018920 |
| w4 | 0.008476 | 0.009198 | 0.016536 | 0.009472 | -0.003494 | -0.000218 | 0.095178 | 0.056076 | 0.000579 | -0.012543 | ... | -0.018875 | -0.008072 | 1.000000 | -0.034891 | 0.009103 | 0.027348 | -0.001473 | -0.013206 | -0.006356 | -0.007758 |
| w5 | 0.002725 | 0.023606 | -0.007854 | -0.008717 | 0.013799 | -0.192192 | 0.143398 | -0.162077 | 0.016125 | 0.126056 | ... | -0.065830 | -0.022501 | -0.034891 | 1.000000 | -0.029986 | -0.018547 | -0.004383 | -0.063529 | -0.049018 | 0.032107 |
| w6 | -0.003050 | 0.011877 | 0.005801 | 0.007259 | -0.001201 | 0.002683 | 0.082177 | 0.049499 | -0.002245 | -0.001013 | ... | -0.017953 | -0.006938 | 0.009103 | -0.029986 | 1.000000 | 0.006161 | -0.001266 | -0.006552 | -0.004558 | 0.004522 |
| w7 | 0.002238 | 0.005003 | -0.001132 | 0.004525 | -0.003705 | -0.002247 | 0.042727 | 0.037001 | -0.007027 | -0.002399 | ... | -0.010741 | -0.004291 | 0.027348 | -0.018547 | 0.006161 | 1.000000 | -0.000783 | -0.008507 | -0.004735 | -0.007082 |
| w8 | -0.001200 | 0.006041 | -0.001140 | -0.003140 | 0.024071 | -0.007910 | 0.020738 | -0.001196 | 0.004233 | 0.000172 | ... | -0.002780 | -0.001014 | -0.001473 | -0.004383 | -0.001266 | -0.000783 | 1.000000 | -0.002682 | -0.002070 | -0.000696 |
| w9 | 0.005347 | -0.004678 | 0.010506 | 0.003996 | 0.011323 | -0.027977 | 0.009638 | 0.009322 | 0.016541 | -0.034390 | ... | -0.035607 | -0.014698 | -0.013206 | -0.063529 | -0.006552 | -0.008507 | -0.002682 | 1.000000 | -0.022151 | 0.019271 |
| w10 | 0.009121 | -0.014162 | -0.008495 | 0.015541 | 0.002762 | -0.009326 | 0.027184 | -0.015386 | 0.007098 | -0.030023 | ... | -0.025303 | -0.011341 | -0.006356 | -0.049018 | -0.004558 | -0.004735 | -0.002070 | -0.022151 | 1.000000 | -0.014007 |
| target90 | 0.001242 | -0.028074 | -0.001114 | 0.008615 | 0.083011 | 0.048831 | -0.006011 | 0.061510 | 0.009634 | -0.029298 | ... | 0.016079 | 0.018920 | -0.007758 | 0.032107 | 0.004522 | -0.007082 | -0.000696 | 0.019271 | -0.014007 | 1.000000 |
32 rows × 32 columns
Image(filename="i-ii.png")
df['target90'].value_counts()
0 26377 1 6051 Name: target90, dtype: int64
import seaborn as sns
sns.countplot(df['target90'])
<AxesSubplot:xlabel='target90', ylabel='count'>
from sklearn.utils import resample
#create two different dataframe of majority and minority class
df_majority = df[(df['target90']==0)]
df_minority = df[(df['target90']==1)]
# upsample minority class
df_minority_upsampled = resample(df_minority,
replace=True, # sample with replacement
n_samples= 121797, # to match majority class
random_state=42) # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
df_upsampled['target90'].value_counts()
1 121797 0 26377 Name: target90, dtype: int64
sns.countplot(df_upsampled['target90'])
<AxesSubplot:xlabel='target90', ylabel='count'>
df['target90'].isna().sum()
0
df['target90'].value_counts()
0 26377 1 6051 Name: target90, dtype: int64
#PART 1
# import SMOTE oversampling and other necessary libraries
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#import data
url = "https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv"
df = pd.read_csv(url)
# Separating the independent variables from dependent variables
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
#Split train-test data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)
# summarize class distribution
print("Before oversampling: ",Counter(y_train))
# define oversampling strategy
SMOTE = SMOTE()
# fit and apply the transform
X_train_SMOTE, y_train_SMOTE = SMOTE.fit_resample(X_train, y_train)
# summarize class distribution
print("After oversampling: ",Counter(y_train_SMOTE))
#PART 2
# import SVM libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
model=SVC()
clf_SMOTE = model.fit(X_train_SMOTE, y_train_SMOTE)
pred_SMOTE = clf_SMOTE.predict(X_test)
print("ROC AUC score for oversampled SMOTE data: ", roc_auc_score(y_test, pred_SMOTE))
Before oversampling: Counter({0: 27916, 1: 3731})
After oversampling: Counter({0: 27916, 1: 27916})
ROC AUC score for oversampled SMOTE data: 0.7474275271435741
#import data
url = "https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv"
df = pd.read_csv(url)
# Separating the independent variables from dependent variables
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
#Split train-test data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30)
# summarize class distribution
print("Before undersampling: ", Counter(y_train))
# define undersampling strategy
undersample = RandomUnderSampler(sampling_strategy='majority')
# fit and apply the transform
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)
# summarize class distribution
print("After undersampling: ", Counter(y_train_under))
#PART 2
# import SVM libraries
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score
model=SVC()
clf_under = model.fit(X_train_under, y_train_under)
pred_under = clf_under.predict(X_test)
print("ROC AUC score for undersampled data: ", roc_auc_score(y_test, pred_under))
Before undersampling: Counter({0: 27964, 1: 3683})
After undersampling: Counter({0: 3683, 1: 3683})
ROC AUC score for undersampled data: 0.7278310325241708
#PART 1
# import sampling and other necessary libraries
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
#import data
url = "https://raw.githubusercontent.com/jackty9/Handling_Imbalanced_Data_in_Python/master/bank-full-encoded.csv"
df = pd.read_csv(url)
# Separating the independent variables from dependent variables
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
# define pipeline
model = SVC()
over = SMOTE(sampling_strategy=0.4)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under), ('model', model)]
pipeline = Pipeline(steps=steps)
#PART 2
# import libraries for evaluation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from numpy import mean
# evaluate pipeline
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=5, n_jobs=-1)
score = mean(scores)
print('ROC AUC score for the combined sampling method: %.3f' % score)
ROC AUC score for the combined sampling method: 0.819
from IPython.display import Image
Image(filename="Normalization.png")
Image(filename="Standarization.png")
Image(filename="Decimal-Scale.png")
import pandas as pd
# data frame containing the odometer reading (km) and the fuel economy (km/l) of second-hand cars
df_cars = pd.DataFrame([[120000, 11], [250000, 11.5], [175000, 15.8], [350000, 17], [400000, 10]],
columns=['odometer_reading', 'fuel_economy'])
df_cars
| odometer_reading | fuel_economy | |
|---|---|---|
| 0 | 120000 | 11.0 |
| 1 | 250000 | 11.5 |
| 2 | 175000 | 15.8 |
| 3 | 350000 | 17.0 |
| 4 | 400000 | 10.0 |
# apply the min-max scaling in Pandas using the .min() and .max() methods
def min_max_scaling(df):
# copy the dataframe
df_norm = df.copy()
# apply min-max scaling
for column in df_norm.columns:
df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())
return df_norm
# call the min_max_scaling function
df_cars_normalized = min_max_scaling(df_cars)
df_cars_normalized
| odometer_reading | fuel_economy | |
|---|---|---|
| 0 | 0.000000 | 0.142857 |
| 1 | 0.464286 | 0.214286 |
| 2 | 0.196429 | 0.828571 |
| 3 | 0.821429 | 1.000000 |
| 4 | 1.000000 | 0.000000 |
from sklearn.preprocessing import MinMaxScaler
#create scalar object
scaler = MinMaxScaler()
#fit and transform the data we have
df_norm = pd.DataFrame(scaler.fit_transform(df_cars),columns=df_cars.columns)
df_norm
| odometer_reading | fuel_economy | |
|---|---|---|
| 0 | 0.000000 | 0.142857 |
| 1 | 0.464286 | 0.214286 |
| 2 | 0.196429 | 0.828571 |
| 3 | 0.821429 | 1.000000 |
| 4 | 1.000000 | 0.000000 |
def z_score(df):
for col in df.columns:
col_zscore = col +'_zscore'
df[col_zscore] =( df[col] -df[col].mean()) / df[col].std(ddof = 0)
return df
# calling to the function z-score
df_cars_Standardized = z_score(df_cars)
df_cars_Standardized
| odometer_reading | fuel_economy | odometer_reading_zscore | |
|---|---|---|---|
| 0 | 120000 | 11.0 | -1.329915 |
| 1 | 250000 | 11.5 | -0.086110 |
| 2 | 175000 | 15.8 | -0.803690 |
| 3 | 350000 | 17.0 | 0.870664 |
| 4 | 400000 | 10.0 | 1.349051 |
import scipy.stats as stats
temp = df_cars.copy()
temp.apply(stats.zscore)
| odometer_reading | fuel_economy | odometer_reading_zscore | |
|---|---|---|---|
| 0 | -1.329915 | -0.736918 | -1.329915 |
| 1 | -0.086110 | -0.558055 | -0.086110 |
| 2 | -0.803690 | 0.980173 | -0.803690 |
| 3 | 0.870664 | 1.409446 | 0.870664 |
| 4 | 1.349051 | -1.094646 | 1.349051 |
def Decimal_scale(df):
temp = df.copy()
for x in df:
p = df[x].max()
q = len(str(abs(p)))
df[x] = df[x]/10**q
Decimal_scale(df_cars)
df_cars
| odometer_reading | fuel_economy | odometer_reading_zscore | |
|---|---|---|---|
| 0 | 0.120 | 0.00110 | -1.329915e-18 |
| 1 | 0.250 | 0.00115 | -8.610962e-20 |
| 2 | 0.175 | 0.00158 | -8.036898e-19 |
| 3 | 0.350 | 0.00170 | 8.706639e-19 |
| 4 | 0.400 | 0.00100 | 1.349051e-18 |
Image(filename="Discarization.png")
#equal frequency
def equifreq(arr1, m):
a = len(arr1)
n = int(a / m)
for i in range(0, m):
arr = []
for j in range(i * n, (i + 1) * n):
if j >= a:
break
arr = arr + [arr1[j]]
print(arr)
array = [5, 10, 11, 13, 15, 35, 50, 55, 72, 92, 204, 215]
equifreq(array, 3)
[5, 10, 11, 13] [15, 35, 50, 55] [72, 92, 204, 215]
#define function to calculate equal-frequency bins
def equalObs(x, nbin):
nlen = len(x)
return np.interp(np.linspace(0, nlen, nbin + 1),
np.arange(nlen),
np.sort(x))
import numpy as np
import matplotlib.pyplot as plt
#create data
np.random.seed(1)
data = np.random.randn(50)
#create histogram with equal-frequency bins
n, bins, patches = plt.hist(data, equalObs(data, 10), edgecolor='black')
plt.xlabel('Bin boundaries', size = 5)
plt.ylabel('Frequency per bin', size = 1)
plt.show()
#display bin boundaries and frequency per bin
bins, n
(array([-2.3015387 , -1.07296862, -0.7612069 , -0.67124613, -0.38405435,
-0.17242821, 0.12015895, 0.53035547, 0.90085595, 1.62434536,
2.10025514]),
array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.]))
#equal width
def equiwidth(arr1, m):
a = len(arr1)
w = int((max(arr1) - min(arr1)) / m)
min1 = min(arr1)
arr = []
for i in range(0, m + 1):
arr = arr + [min1 + w * i]
arri=[]
for i in range(0, m):
temp = []
for j in arr1:
if j >= arr[i] and j <= arr[i+1]:
temp += [j]
arri += [temp]
print(arri)
equiwidth(array, 3)
[[5, 10, 11, 13, 15, 35, 50, 55, 72], [92], [204, 215]]
#create histogram with equal-width bins
n, bins, patches = plt.hist(data, edgecolor='black')
plt.xlabel('Bin boundaries', size = 14)
plt.ylabel('Frequency per bin', size = 14)
plt.show()
#display bin boundaries and frequency per bin
bins, n
(array([-2.3015387 , -1.86135931, -1.42117993, -0.98100055, -0.54082116,
-0.10064178, 0.3395376 , 0.77971699, 1.21989637, 1.66007575,
2.10025514]),
array([ 2., 0., 4., 12., 9., 7., 5., 5., 3., 3.]))
Image(filename="Smoothing.png")
Image(filename="Smoothing-ex.png")
Image(filename="Smoothing-methods.png")
Image(filename="Smooth-by-bin.png")
#import pandas as pd
#import numpy as np
product = {'month' : [1,2,3,4,5,6,7,8,9,10,11,12],'demand':[290,260,288,300,310,303,329,340,316,330,308,310]}
temp_df = pd.DataFrame(product)
temp_df
| month | demand | |
|---|---|---|
| 0 | 1 | 290 |
| 1 | 2 | 260 |
| 2 | 3 | 288 |
| 3 | 4 | 300 |
| 4 | 5 | 310 |
| 5 | 6 | 303 |
| 6 | 7 | 329 |
| 7 | 8 | 340 |
| 8 | 9 | 316 |
| 9 | 10 | 330 |
| 10 | 11 | 308 |
| 11 | 12 | 310 |
# moving average for 3 months window periods
temp_df['SMA 3'] = temp_df.iloc[:,1].rolling(window=3).mean()
temp_df
| month | demand | SMA 3 | |
|---|---|---|---|
| 0 | 1 | 290 | NaN |
| 1 | 2 | 260 | NaN |
| 2 | 3 | 288 | 279.333333 |
| 3 | 4 | 300 | 282.666667 |
| 4 | 5 | 310 | 299.333333 |
| 5 | 6 | 303 | 304.333333 |
| 6 | 7 | 329 | 314.000000 |
| 7 | 8 | 340 | 324.000000 |
| 8 | 9 | 316 | 328.333333 |
| 9 | 10 | 330 | 328.666667 |
| 10 | 11 | 308 | 318.000000 |
| 11 | 12 | 310 | 316.000000 |
def lwma(Data, period):
weighted = []
for i in range(len(Data)):
try:
total = np.arange(1, period + 1, 1) # weight matrix
matrix = Data[i - period + 1: i + 1, 3:4]
matrix = np.ndarray.flatten(matrix)
matrix = total * matrix # multiplication
wma = (matrix.sum()) / (total.sum()) # WMA
weighted = np.append(weighted, wma) # add to array
except ValueError:
pass
return weighted
def ema(Data, alpha, window, what, whereSMA, whereEMA):
# alpha is the smoothing factor
# window is the lookback period
# what is the column that needs to have its average calculated
# where is where to put the exponential moving average
alpha = alpha / (window + 1.0)
beta = 1 - alpha
# First value is a simple SMA
Data[window - 1, whereSMA] = np.mean(Data[:window - 1, what])
# Calculating first EMA
Data[window, whereEMA] = (Data[window, what] * alpha) + (Data[window - 1, whereSMA] * beta)
# Calculating the rest of EMA
for i in range(window + 1, len(Data)):
try:
Data[i, whereEMA] = (Data[i, what] * alpha) + (Data[i - 1, whereEMA] * beta)
except IndexError:
pass
return Data
import numpy as np
import math
from sklearn.datasets import load_iris
from sklearn import datasets, linear_model, metrics
# load iris data set
dataset = load_iris()
a = dataset.data
b = np.zeros(150)
# take 1st column among 4 column of data set
for i in range (150):
b[i]=a[i,1]
b=np.sort(b) #sort the array
# create bins
bin1=np.zeros((30,5))
bin2=np.zeros((30,5))
# Bin mean
for i in range (0,150,5):
k=int(i/5)
mean=(b[i] + b[i+1] + b[i+2] + b[i+3] + b[i+4])/5
for j in range(5):
bin1[k,j]=mean
print("Bin Mean: \n",bin1)
Bin Mean: [[2.18 2.18 2.18 2.18 2.18] [2.34 2.34 2.34 2.34 2.34] [2.48 2.48 2.48 2.48 2.48] [2.52 2.52 2.52 2.52 2.52] [2.62 2.62 2.62 2.62 2.62] [2.7 2.7 2.7 2.7 2.7 ] [2.74 2.74 2.74 2.74 2.74] [2.8 2.8 2.8 2.8 2.8 ] [2.8 2.8 2.8 2.8 2.8 ] [2.86 2.86 2.86 2.86 2.86] [2.9 2.9 2.9 2.9 2.9 ] [2.96 2.96 2.96 2.96 2.96] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3.04 3.04 3.04 3.04 3.04] [3.1 3.1 3.1 3.1 3.1 ] [3.12 3.12 3.12 3.12 3.12] [3.2 3.2 3.2 3.2 3.2 ] [3.2 3.2 3.2 3.2 3.2 ] [3.26 3.26 3.26 3.26 3.26] [3.34 3.34 3.34 3.34 3.34] [3.4 3.4 3.4 3.4 3.4 ] [3.4 3.4 3.4 3.4 3.4 ] [3.5 3.5 3.5 3.5 3.5 ] [3.58 3.58 3.58 3.58 3.58] [3.74 3.74 3.74 3.74 3.74] [3.82 3.82 3.82 3.82 3.82] [4.12 4.12 4.12 4.12 4.12]]
for i in range (0,150,5):
k=int(i/5)
for j in range (5):
if (b[i+j]-b[i]) < (b[i+4]-b[i+j]):
bin2[k,j]=b[i]
else:
bin2[k,j]=b[i+4]
print("Bin Boundaries: \n",bin2)
Bin Boundaries: [[2. 2.3 2.3 2.3 2.3] [2.3 2.3 2.3 2.4 2.4] [2.4 2.5 2.5 2.5 2.5] [2.5 2.5 2.5 2.5 2.6] [2.6 2.6 2.6 2.6 2.7] [2.7 2.7 2.7 2.7 2.7] [2.7 2.7 2.7 2.8 2.8] [2.8 2.8 2.8 2.8 2.8] [2.8 2.8 2.8 2.8 2.8] [2.8 2.8 2.9 2.9 2.9] [2.9 2.9 2.9 2.9 2.9] [2.9 2.9 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3. 3. ] [3. 3. 3. 3.1 3.1] [3.1 3.1 3.1 3.1 3.1] [3.1 3.1 3.1 3.1 3.2] [3.2 3.2 3.2 3.2 3.2] [3.2 3.2 3.2 3.2 3.2] [3.2 3.2 3.3 3.3 3.3] [3.3 3.3 3.3 3.4 3.4] [3.4 3.4 3.4 3.4 3.4] [3.4 3.4 3.4 3.4 3.4] [3.5 3.5 3.5 3.5 3.5] [3.5 3.6 3.6 3.6 3.6] [3.7 3.7 3.7 3.8 3.8] [3.8 3.8 3.8 3.8 3.9] [3.9 3.9 3.9 4.4 4.4]]